Today we’ll be looking at data on the presence and absence of the short-finned eel (Anguilla australis) at a number of sites in New Zealand. These data come from Elith, J., Leathwick, J.R., and Hastie, T., 2009. A working guide to boosted regression trees. Journal of Animal Ecology 77: 802-81.
Codebook:
presence - presence or absence of Anguilla australis at the sampling location (0=absent, 1=present)LocSed - weighted average of proportional cover of bed sediment (1=mud, 2=sand, 3=fine gravel, 4=coarse gravel, 5=cobble, 6=boulder, 7=bedrock)SegSumT - Summer air temperature (degrees C) –>DSDist - Distance to coast (km)DSDam - Presence of known downstream obstructions, mostly damsDSMaxSlope - Maximum downstream slope (degrees)USRainDays - days per month with rain greater than 25 mmUSSlope - average slope in the upstream catchment (degrees)USNative - area with indigenous forest (proportion)Method - fishing method (electric, net, spot, trap, or mixture)load(url("http://www.stat.duke.edu/~cr173/Sta444_Sp17/data/anguilla.Rdata"))
set.seed(20170130)
part = resample_partition(anguilla, c(train=0.75, test=0.25))
anguilla = as.data.frame(part$train)
anguilla_test = as.data.frame(part$test)
g = glm(presence ~ SegSumT, data=anguilla, family=binomial)
summary(g)
##
## Call:
## glm(formula = presence ~ SegSumT, family = binomial, data = anguilla)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4619 -0.6642 -0.3822 -0.1451 2.7510
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.78289 1.54087 -9.594 <2e-16 ***
## SegSumT 0.78727 0.08753 8.995 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 630.04 on 616 degrees of freedom
## Residual deviance: 509.31 on 615 degrees of freedom
## AIC: 513.31
##
## Number of Fisher Scoring iterations: 5
inv_logit = function(x) exp(x)/(1+exp(x))
pred = data_frame(
SegSumT = seq(10, 25, length.out = 1000)
) %>%
add_predictions(g) %>%
mutate(pred = inv_logit(pred))
ggplot(anguilla, aes(x=SegSumT, y=presence)) +
#geom_point() +
geom_jitter(height=0.05, alpha=0.33) +
geom_line(data=pred, aes(y=pred), col='red')
d_g = anguilla %>%
add_predictions(g, var="p_hat") %>%
mutate(p_hat = inv_logit(p_hat)) %>%
mutate(resid = presence - p_hat) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025))
ggplot(d_g, aes(x=p_hat, y=resid)) +
geom_point()
d_g %>%
group_by(p_hat_bin) %>%
summarize(resid_mean = mean(resid), resid_sd = sd(resid)) %>%
ggplot(aes(x=p_hat_bin, y=resid_mean)) +
geom_point()
\[ r_i = \frac{Y_i - E(Y_i)}{Var(Y_i)} = \frac{Y_i - \hat{p}_i}{\hat{p}_i(1-\hat{p}_i)} \]
d_g = d_g %>%
mutate(pearson = (presence - p_hat)/(p_hat*(1-p_hat)))
ggplot(d_g, aes(x=p_hat, y=pearson)) +
geom_point()
d_g %>%
group_by(p_hat_bin) %>%
summarize(pearson_mean = mean(pearson), resid_sd = sd(pearson)) %>%
ggplot(aes(x=p_hat_bin, y=pearson_mean)) +
geom_point()
\[ d_i = \text{sign}(Y_i-\hat{p_i}) \sqrt{ -2 \left(Y_i \log \hat{p}_i+(1-Y_i)\log (Y_i - \hat{p}_i) \right) } \]
d_g = d_g %>%
mutate(deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat))))
ggplot(d_g, aes(x=p_hat, y=deviance)) +
geom_point()
d_g %>%
group_by(p_hat_bin) %>%
summarize(deviance_mean = mean(deviance), deviance_sd = sd(deviance)) %>%
ggplot(aes(x=p_hat_bin, y=deviance_mean)) +
geom_point()
g
##
## Call: glm(formula = presence ~ SegSumT, family = binomial, data = anguilla)
##
## Coefficients:
## (Intercept) SegSumT
## -14.7829 0.7873
##
## Degrees of Freedom: 616 Total (i.e. Null); 615 Residual
## Null Deviance: 630
## Residual Deviance: 509.3 AIC: 513.3
sum(d_g$deviance^2)
## [1] 509.305
shinyApp(
ui = mainPanel(
plotOutput("plot"),
selectInput("var", label = "Variable", choices = c("resid","deviance","pearson")),
sliderInput("bin_width", label="Bin width", min = 0, max=0.2, step = 0.005, value=0.025)
),
server = function(input, output)
{
output$plot = renderPlot(
d_g %>%
mutate(p_hat_bin = p_hat - (p_hat %% input$bin_width)) %>%
group_by(p_hat_bin) %>%
summarize_(mean = paste0('mean(', input$var, ')')) %>%
ggplot(aes(x=p_hat_bin, y=mean)) +
geom_point()
)
}
)
f = glm(presence~., data=anguilla, family=binomial)
summary(f)
##
## Call:
## glm(formula = presence ~ ., family = binomial, data = anguilla)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.15801 -0.56129 -0.28822 -0.08002 2.85046
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.869669 1.771435 -5.007 5.53e-07 ***
## SegSumT 0.623457 0.097058 6.424 1.33e-10 ***
## DSDist -0.002802 0.002106 -1.330 0.1834
## DSMaxSlope -0.129213 0.068263 -1.893 0.0584 .
## USRainDays -0.525662 0.214254 -2.453 0.0141 *
## USSlope -0.051561 0.025355 -2.034 0.0420 *
## USNative -0.468314 0.463225 -1.011 0.3120
## DSDam -0.878588 0.508636 -1.727 0.0841 .
## Methodmixture -0.066323 0.510633 -0.130 0.8967
## Methodnet -1.162441 0.505540 -2.299 0.0215 *
## Methodspo -1.650816 0.827089 -1.996 0.0459 *
## Methodtrap -2.953097 0.686260 -4.303 1.68e-05 ***
## LocSed -0.213485 0.099421 -2.147 0.0318 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 630.04 on 616 degrees of freedom
## Residual deviance: 434.44 on 604 degrees of freedom
## AIC: 460.44
##
## Number of Fisher Scoring iterations: 6
d_f = anguilla %>%
add_predictions(f, var="p_hat") %>%
mutate(p_hat = inv_logit(p_hat)) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
mutate(
resid = presence - p_hat,
pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
)
resid_f = d_f %>%
select(p_hat:deviance) %>%
gather(type, value, -p_hat, -p_hat_bin) %>%
mutate(type = factor(type, levels=c("resid","pearson","deviance")))
ggplot(resid_f, aes(x=p_hat, y=value, color=type)) +
geom_point(alpha=0.1) +
facet_wrap(~type, ncol=3, scale="free_y")
resid_f %>%
group_by(type, p_hat_bin) %>%
summarize(mean = mean(value)) %>%
ggplot(aes(x=p_hat_bin, y=mean, color=type)) +
geom_point() +
facet_wrap(~type, ncol=3, scale="free_y")
ggplot(d_f, aes(x=SegSumT, y=deviance, color=as.factor(presence))) + geom_point()
ggplot(d_f, aes(x=DSMaxSlope, y=deviance, color=as.factor(presence))) + geom_point()
Possible fix:
f2 = glm(presence~.+I(SegSumT^2), data=anguilla, family=binomial)
summary(f2)
##
## Call:
## glm(formula = presence ~ . + I(SegSumT^2), family = binomial,
## data = anguilla)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.09241 -0.57666 -0.26048 -0.02316 2.99017
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.614787 22.585115 -2.817 0.00485 **
## SegSumT 7.175469 2.666667 2.691 0.00713 **
## DSDist -0.003869 0.002158 -1.793 0.07296 .
## DSMaxSlope -0.128906 0.067778 -1.902 0.05719 .
## USRainDays -0.632217 0.223784 -2.825 0.00473 **
## USSlope -0.066911 0.026264 -2.548 0.01084 *
## USNative -0.283431 0.471036 -0.602 0.54736
## DSDam -1.015057 0.514068 -1.975 0.04832 *
## Methodmixture 0.081875 0.509820 0.161 0.87241
## Methodnet -1.137531 0.502683 -2.263 0.02364 *
## Methodspo -1.537345 0.806835 -1.905 0.05673 .
## Methodtrap -2.895911 0.686317 -4.219 2.45e-05 ***
## LocSed -0.233322 0.099856 -2.337 0.01946 *
## I(SegSumT^2) -0.193490 0.078021 -2.480 0.01314 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 630.04 on 616 degrees of freedom
## Residual deviance: 426.80 on 603 degrees of freedom
## AIC: 454.8
##
## Number of Fisher Scoring iterations: 7
d_f2 = anguilla %>%
add_predictions(f2, var="p_hat") %>%
mutate(p_hat = inv_logit(p_hat)) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
mutate(
resid = presence - p_hat,
pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
)
rbind(
cbind(d_f, model="linearT"),
cbind(d_f2, model="quadT")
) %>%
ggplot(aes(x=SegSumT, y=deviance, color=as.factor(presence))) + geom_point() + facet_wrap(~model)
d_f %>%
select(deviance, SegSumT:LocSed) %>%
mutate(DSDam = factor(DSDam)) %>%
ggpairs(
lower = list(continuous = "cor"),
upper = list(continuous = wrap("points", alpha = 0.05))
)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
y = anguilla$presence %>% as.integer()
x = model.matrix(presence~.-1, data=anguilla)
xg = xgboost(data=x, label=y, nthead=4, nround=30, objective="binary:logistic")
## [1] train-error:0.100486
## [2] train-error:0.092382
## [3] train-error:0.079417
## [4] train-error:0.079417
## [5] train-error:0.069692
## [6] train-error:0.061588
## [7] train-error:0.055105
## [8] train-error:0.045381
## [9] train-error:0.040519
## [10] train-error:0.043760
## [11] train-error:0.038898
## [12] train-error:0.035656
## [13] train-error:0.029173
## [14] train-error:0.027553
## [15] train-error:0.024311
## [16] train-error:0.024311
## [17] train-error:0.019449
## [18] train-error:0.012966
## [19] train-error:0.009724
## [20] train-error:0.009724
## [21] train-error:0.011345
## [22] train-error:0.008104
## [23] train-error:0.004862
## [24] train-error:0.001621
## [25] train-error:0.001621
## [26] train-error:0.001621
## [27] train-error:0.001621
## [28] train-error:0.000000
## [29] train-error:0.000000
## [30] train-error:0.000000
pred_xg =
d_xg = anguilla %>%
mutate(p_hat = predict(xg, newdata=x)) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
mutate(
resid = presence - p_hat,
pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
)
resid_xg = d_xg %>%
select(p_hat:deviance) %>%
gather(type, value, -p_hat, -p_hat_bin) %>%
mutate(type = factor(type, levels=c("resid","pearson","deviance")))
ggplot(resid_xg, aes(x=p_hat, y=value, color=type)) +
geom_point(alpha=0.1) +
facet_wrap(~type, ncol=3, scale="free_y")
resid_f %>%
group_by(type, p_hat_bin) %>%
summarize(mean = mean(value)) %>%
ggplot(aes(x=p_hat_bin, y=mean, color=type)) +
geom_point() +
facet_wrap(~type, ncol=3, scale="free_y")
grid.arrange(
ggplot(d_g, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
geom_jitter(height=0.2, alpha=0.5) + labs(title="simple"),
ggplot(d_f, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
geom_jitter(height=0.2, alpha=0.5) + labs(title="full"),
ggplot(d_xg, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
geom_jitter(height=0.2, alpha=0.5) + labs(title="xgboost")
)
score = function(p_hat, label, cutoff)
{
map_df(
cutoff,
~ data.frame(
sensitivity = sum(p_hat >= . & label) / length(p_hat),
specificity = sum(p_hat < . & !label) / length(p_hat)
)
)
}
score(d_f$p_hat, d_f$presence, seq(0,1,by=0.1)) %>%
ggplot(aes(x=specificity, y=sensitivity)) +
geom_point() +
geom_line()
score(d_f$p_hat, d_f$presence, seq(0,1,by=0.01)) %>%
ggplot(aes(x=1-specificity, y=sensitivity)) +
#geom_point() +
geom_line()
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
roc = function(d)
prediction(d$p_hat, d$presence) %>% performance(measure = "tpr", x.measure = "fpr")
auc = function(d)
{prediction(d$p_hat, d$presence) %>% performance(measure = "auc")}@y.values[[1]]
perf_g = roc(d_g)
perf_f = roc(d_f)
perf_xg = roc(d_xg)
plot(perf_g, col="#7fc97f", lwd=2)
plot(perf_f, col="#beaed4", lwd=2, add=TRUE)
plot(perf_xg, col="#fdc086", lwd=2, add=TRUE)
abline(a=0, b = 1, col="grey")
auc(d_g)
## [1] 0.7993274
auc(d_f)
## [1] 0.8694881
auc(d_xg)
## [1] 1
d_f_test = anguilla_test %>%
add_predictions(f, var="p_hat") %>%
mutate(p_hat = inv_logit(p_hat)) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
mutate(
resid = presence - p_hat,
pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
)
d_xg_test = anguilla_test %>%
mutate(p_hat = predict(xg, newdata=model.matrix(presence~.-1, data=anguilla_test))) %>%
mutate(p_hat_bin = p_hat - (p_hat %% 0.025)) %>%
mutate(
resid = presence - p_hat,
pearson = (presence - p_hat)/(p_hat*(1-p_hat)),
deviance = sign(presence-p_hat) * sqrt(-2 * (presence * log(p_hat) + (1-presence) * log(1-p_hat)))
)
grid.arrange(
ggplot(d_f_test, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
geom_jitter(height=0.2, alpha=0.5) + labs(title="full"),
ggplot(d_xg_test, aes(x=p_hat, y=factor(presence), color=factor(presence))) +
geom_jitter(height=0.2, alpha=0.5) + labs(title="xgboost")
)
plot(roc(d_f_test), col="#beaed4", lwd=2)
plot(roc(d_xg_test), col="#fdc086", lwd=2, add=TRUE)
abline(a=0, b = 1, col="grey")
auc(d_f_test)
## [1] 0.8574315
auc(d_xg_test)
## [1] 0.8848485